{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AdventureWorks - Hard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "import findspark\n",
    "import pandas as pd\n",
    "findspark.init()\n",
    "\n",
    "SVR = '192.168.31.31'\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import *\n",
    "from pyspark.sql import Window\n",
    "\n",
    "sc = (SparkSession.builder.appName('app14-3') \n",
    "      .master(f'spark://{SVR}:7077') \n",
    "      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') \n",
    "      .config('spark.cores.max', '4') \n",
    "      .config('spark.executor.instances', '1') \n",
    "      .config('spark.executor.cores', '2') \n",
    "      .config('spark.executor.memory', '10g') \n",
    "      .enableHiveSupport().getOrCreate())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "cust_aw = sc.read.table('sqlzoo.CustomerAW')\n",
    "cust_addr = sc.read.table('sqlzoo.CustomerAddress')\n",
    "addr = sc.read.table('sqlzoo.Address')\n",
    "product = sc.read.table('sqlzoo.Product')\n",
    "order_det = sc.read.table('sqlzoo.SalesOrderDetail')\n",
    "order_head = sc.read.table('sqlzoo.SalesOrderHeader')\n",
    "prod_model = sc.read.table('sqlzoo.ProductModel')\n",
    "prod_model_prod = sc.read.table('sqlzoo.ProductModelProductDescription')\n",
    "prod_desc = sc.read.table('sqlzoo.ProductDescription')\n",
    "prod_cat = sc.read.table('sqlzoo.ProductCategory')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11.\n",
    "**For every customer with a 'Main Office' in Dallas show AddressLine1 of the 'Main Office' and AddressLine1 of the 'Shipping' address - if there is no shipping address leave it blank. Use one row per customer.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CompanyName</th>\n",
       "      <th>Main Office</th>\n",
       "      <th>Shipping</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Elite Bikes</td>\n",
       "      <td>Po Box 8259024</td>\n",
       "      <td>9178 Jumping St.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Rental Bikes</td>\n",
       "      <td>99828 Routh Street, Suite 825</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Third Bike Store</td>\n",
       "      <td>2500 North Stemmons Freeway</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Town Industries</td>\n",
       "      <td>P.O. Box 6256916</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Unsurpassed Bikes</td>\n",
       "      <td>Po Box 8035996</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         CompanyName                    Main Office          Shipping\n",
       "0        Elite Bikes                 Po Box 8259024  9178 Jumping St.\n",
       "1       Rental Bikes  99828 Routh Street, Suite 825                  \n",
       "2   Third Bike Store    2500 North Stemmons Freeway                  \n",
       "3    Town Industries               P.O. Box 6256916                  \n",
       "4  Unsurpassed Bikes                 Po Box 8035996                  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(cust_aw.join(cust_addr, 'CustomerID')\n",
    " .join(cust_aw\n",
    "       .join(cust_addr.filter(cust_addr['AddressType']=='Main Office'),\n",
    "             on='CustomerID')\n",
    "       .join(addr.filter(addr['City']=='Dallas'), on='AddressID') \n",
    "       .select('CustomerID')\n",
    "       .distinct(), on='CustomerID')\n",
    " .join(addr, on='AddressID')\n",
    " .select('CompanyName', 'AddressType', 'AddressLine1')\n",
    " .distinct()\n",
    " .groupBy('CompanyName')\n",
    " .pivot('AddressType', ['Main Office', 'Shipping'])\n",
    " .agg(first('AddressLine1'))\n",
    " .fillna('')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12.\n",
    "**For each order show the SalesOrderID and SubTotal calculated three ways:**\n",
    "\n",
    "- **A) From the SalesOrderHeader**\n",
    "- **B) Sum of OrderQty*UnitPrice**\n",
    "- **C) Sum of OrderQty*ListPrice**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SalesOrderID</th>\n",
       "      <th>SubTotal</th>\n",
       "      <th>sum(SubTotal)</th>\n",
       "      <th>sum(SubTotal)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>71774</td>\n",
       "      <td>880.35</td>\n",
       "      <td>713.8000</td>\n",
       "      <td>1189.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>71776</td>\n",
       "      <td>78.81</td>\n",
       "      <td>63.9000</td>\n",
       "      <td>106.50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>71780</td>\n",
       "      <td>38418.69</td>\n",
       "      <td>29922.8100</td>\n",
       "      <td>56651.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>71782</td>\n",
       "      <td>39785.33</td>\n",
       "      <td>33319.6800</td>\n",
       "      <td>55533.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>71783</td>\n",
       "      <td>83858.43</td>\n",
       "      <td>65682.7396</td>\n",
       "      <td>121625.43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>71784</td>\n",
       "      <td>108561.83</td>\n",
       "      <td>89868.8795</td>\n",
       "      <td>151932.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>71796</td>\n",
       "      <td>57634.63</td>\n",
       "      <td>47848.0200</td>\n",
       "      <td>79746.71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>71797</td>\n",
       "      <td>78029.69</td>\n",
       "      <td>65122.7911</td>\n",
       "      <td>108986.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>71815</td>\n",
       "      <td>1141.58</td>\n",
       "      <td>926.9100</td>\n",
       "      <td>1544.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>71816</td>\n",
       "      <td>3398.17</td>\n",
       "      <td>2847.3700</td>\n",
       "      <td>4745.68</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>71831</td>\n",
       "      <td>2016.34</td>\n",
       "      <td>1712.9100</td>\n",
       "      <td>2854.91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>71832</td>\n",
       "      <td>35775.21</td>\n",
       "      <td>28950.4766</td>\n",
       "      <td>50559.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>71845</td>\n",
       "      <td>41622.05</td>\n",
       "      <td>34118.3000</td>\n",
       "      <td>57768.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>71846</td>\n",
       "      <td>2453.76</td>\n",
       "      <td>1884.3800</td>\n",
       "      <td>3592.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>71856</td>\n",
       "      <td>602.19</td>\n",
       "      <td>500.3000</td>\n",
       "      <td>833.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>71858</td>\n",
       "      <td>13823.71</td>\n",
       "      <td>11528.8000</td>\n",
       "      <td>19214.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>71863</td>\n",
       "      <td>3324.28</td>\n",
       "      <td>2777.0538</td>\n",
       "      <td>4633.78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>71867</td>\n",
       "      <td>1059.31</td>\n",
       "      <td>858.9000</td>\n",
       "      <td>1431.50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>71885</td>\n",
       "      <td>550.39</td>\n",
       "      <td>524.6400</td>\n",
       "      <td>874.44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>71895</td>\n",
       "      <td>246.74</td>\n",
       "      <td>221.2400</td>\n",
       "      <td>368.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>71897</td>\n",
       "      <td>12685.89</td>\n",
       "      <td>10585.0100</td>\n",
       "      <td>17641.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>71898</td>\n",
       "      <td>63980.99</td>\n",
       "      <td>53248.5700</td>\n",
       "      <td>88747.82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>71899</td>\n",
       "      <td>2415.67</td>\n",
       "      <td>1856.1800</td>\n",
       "      <td>3545.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>71902</td>\n",
       "      <td>74058.81</td>\n",
       "      <td>59893.7200</td>\n",
       "      <td>106151.57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>71915</td>\n",
       "      <td>2137.23</td>\n",
       "      <td>1732.8600</td>\n",
       "      <td>2888.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>71917</td>\n",
       "      <td>40.90</td>\n",
       "      <td>37.7300</td>\n",
       "      <td>62.93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>71920</td>\n",
       "      <td>2980.79</td>\n",
       "      <td>2527.0800</td>\n",
       "      <td>4211.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>71923</td>\n",
       "      <td>106.54</td>\n",
       "      <td>96.0312</td>\n",
       "      <td>166.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>71935</td>\n",
       "      <td>6634.30</td>\n",
       "      <td>5533.7754</td>\n",
       "      <td>9229.27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>71936</td>\n",
       "      <td>98278.69</td>\n",
       "      <td>79589.0848</td>\n",
       "      <td>138124.87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>71938</td>\n",
       "      <td>88812.86</td>\n",
       "      <td>5102.9500</td>\n",
       "      <td>8504.95</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    SalesOrderID   SubTotal  sum(SubTotal)  sum(SubTotal)\n",
       "0          71774     880.35       713.8000        1189.66\n",
       "1          71776      78.81        63.9000         106.50\n",
       "2          71780   38418.69     29922.8100       56651.56\n",
       "3          71782   39785.33     33319.6800       55533.31\n",
       "4          71783   83858.43     65682.7396      121625.43\n",
       "5          71784  108561.83     89868.8795      151932.58\n",
       "6          71796   57634.63     47848.0200       79746.71\n",
       "7          71797   78029.69     65122.7911      108986.40\n",
       "8          71815    1141.58       926.9100        1544.86\n",
       "9          71816    3398.17      2847.3700        4745.68\n",
       "10         71831    2016.34      1712.9100        2854.91\n",
       "11         71832   35775.21     28950.4766       50559.01\n",
       "12         71845   41622.05     34118.3000       57768.21\n",
       "13         71846    2453.76      1884.3800        3592.65\n",
       "14         71856     602.19       500.3000         833.84\n",
       "15         71858   13823.71     11528.8000       19214.74\n",
       "16         71863    3324.28      2777.0538        4633.78\n",
       "17         71867    1059.31       858.9000        1431.50\n",
       "18         71885     550.39       524.6400         874.44\n",
       "19         71895     246.74       221.2400         368.76\n",
       "20         71897   12685.89     10585.0100       17641.75\n",
       "21         71898   63980.99     53248.5700       88747.82\n",
       "22         71899    2415.67      1856.1800        3545.67\n",
       "23         71902   74058.81     59893.7200      106151.57\n",
       "24         71915    2137.23      1732.8600        2888.15\n",
       "25         71917      40.90        37.7300          62.93\n",
       "26         71920    2980.79      2527.0800        4211.88\n",
       "27         71923     106.54        96.0312         166.81\n",
       "28         71935    6634.30      5533.7754        9229.27\n",
       "29         71936   98278.69     79589.0848      138124.87\n",
       "30         71938   88812.86      5102.9500        8504.95"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(order_head.select('SalesOrderID', 'SubTotal')\n",
    " .join(order_det\n",
    "       .withColumn('SubTotal', col('OrderQty') * col('UnitPrice') * \n",
    "                   (lit(1)-col('UnitPriceDiscount')))\n",
    "       .groupBy(col('SalesOrderID'))\n",
    "       .sum('SubTotal'),\n",
    "      on='SalesOrderID')\n",
    " .join(order_det\n",
    "       .join(product, on='ProductID')\n",
    "       .withColumn('SubTotal', col('OrderQty') * col('ListPrice'))\n",
    "       .groupBy('SalesOrderID')\n",
    "       .sum('SubTotal'),\n",
    "      on='SalesOrderID')\n",
    " .orderBy('SalesOrderID')\n",
    " .toPandas())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13.\n",
    "**Show the best selling item by value.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ProductID</th>\n",
       "      <th>Name</th>\n",
       "      <th>sum(SubTotal)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>969</td>\n",
       "      <td>Touring-1000 Blue, 60</td>\n",
       "      <td>37191.44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>783</td>\n",
       "      <td>Mountain-200 Black, 42</td>\n",
       "      <td>37178.73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>782</td>\n",
       "      <td>Mountain-200 Black, 38</td>\n",
       "      <td>35801.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>976</td>\n",
       "      <td>Road-350-W Yellow, 48</td>\n",
       "      <td>33509.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>957</td>\n",
       "      <td>Touring-1000 Yellow, 60</td>\n",
       "      <td>23745.32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>907</td>\n",
       "      <td>Rear Brakes</td>\n",
       "      <td>63.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133</th>\n",
       "      <td>913</td>\n",
       "      <td>HL Road Seat/Saddle</td>\n",
       "      <td>63.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>134</th>\n",
       "      <td>874</td>\n",
       "      <td>Racing Socks, M</td>\n",
       "      <td>59.29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>135</th>\n",
       "      <td>947</td>\n",
       "      <td>HL Touring Handlebars</td>\n",
       "      <td>54.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>873</td>\n",
       "      <td>Patch Kit/8 Patches</td>\n",
       "      <td>27.40</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>137 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     ProductID                     Name  sum(SubTotal)\n",
       "0          969    Touring-1000 Blue, 60       37191.44\n",
       "1          783   Mountain-200 Black, 42       37178.73\n",
       "2          782   Mountain-200 Black, 38       35801.74\n",
       "3          976    Road-350-W Yellow, 48       33509.58\n",
       "4          957  Touring-1000 Yellow, 60       23745.32\n",
       "..         ...                      ...            ...\n",
       "132        907              Rear Brakes          63.90\n",
       "133        913      HL Road Seat/Saddle          63.16\n",
       "134        874          Racing Socks, M          59.29\n",
       "135        947    HL Touring Handlebars          54.94\n",
       "136        873      Patch Kit/8 Patches          27.40\n",
       "\n",
       "[137 rows x 3 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(order_det.join(product, on='ProductID')\n",
    " .withColumn('SubTotal', col('OrderQty') * col('UnitPrice'))\n",
    " .groupBy('ProductID', 'Name')\n",
    " .sum('SubTotal')\n",
    " .orderBy(col('Sum(SubTotal)').desc())\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 14.\n",
    "**Show how many orders are in the following ranges (in $):**\n",
    "\n",
    "```\n",
    "    RANGE      Num Orders      Total Value\n",
    "    0-  99\n",
    "  100- 999\n",
    " 1000-9999\n",
    "10000-\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RANGE</th>\n",
       "      <th>count(SubTotal)</th>\n",
       "      <th>sum(SubTotal)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0-  99</td>\n",
       "      <td>3</td>\n",
       "      <td>158.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>100- 999</td>\n",
       "      <td>5</td>\n",
       "      <td>2386.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1000-9999</td>\n",
       "      <td>10</td>\n",
       "      <td>27561.43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10000-</td>\n",
       "      <td>14</td>\n",
       "      <td>835326.81</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        RANGE  count(SubTotal)  sum(SubTotal)\n",
       "0      0-  99                3         158.66\n",
       "1    100- 999                5        2386.21\n",
       "2   1000-9999               10       27561.43\n",
       "3  10000-                   14      835326.81"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(order_head\n",
    " .withColumn('RANGE', \n",
    "             when(col('SubTotal').between(0, 99.99), '    0-  99')\n",
    "             .when(col('SubTotal').between(100, 999.99), '  100- 999')\n",
    "             .when(col('SubTotal').between(1000, 9999.99), ' 1000-9999')\n",
    "             .otherwise('10000-    '))\n",
    " .groupBy('RANGE')\n",
    " .agg(count('SubTotal'), sum('SubTotal'))\n",
    " .orderBy('RANGE')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 15.\n",
    "**Identify the three most important cities. Show the break down of top level product category against city.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>catg_name</th>\n",
       "      <th>sum(amount)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>London</td>\n",
       "      <td>Bottom Brackets</td>\n",
       "      <td>388.73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>London</td>\n",
       "      <td>Brakes</td>\n",
       "      <td>255.60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>London</td>\n",
       "      <td>Chains</td>\n",
       "      <td>36.42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>London</td>\n",
       "      <td>Cranksets</td>\n",
       "      <td>1773.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>London</td>\n",
       "      <td>Derailleurs</td>\n",
       "      <td>638.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>London</td>\n",
       "      <td>Gloves</td>\n",
       "      <td>88.14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>London</td>\n",
       "      <td>Handlebars</td>\n",
       "      <td>292.63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>London</td>\n",
       "      <td>Helmets</td>\n",
       "      <td>20.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>London</td>\n",
       "      <td>Mountain Bikes</td>\n",
       "      <td>50881.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>London</td>\n",
       "      <td>Mountain Frames</td>\n",
       "      <td>24018.80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>London</td>\n",
       "      <td>Pedals</td>\n",
       "      <td>390.32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>London</td>\n",
       "      <td>Road Bikes</td>\n",
       "      <td>5102.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>London</td>\n",
       "      <td>Saddles</td>\n",
       "      <td>284.43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>London</td>\n",
       "      <td>Shorts</td>\n",
       "      <td>1072.14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Bike Racks</td>\n",
       "      <td>144.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Bottles and Cages</td>\n",
       "      <td>29.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Caps</td>\n",
       "      <td>57.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Cleaners</td>\n",
       "      <td>38.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Gloves</td>\n",
       "      <td>264.42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Handlebars</td>\n",
       "      <td>72.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Helmets</td>\n",
       "      <td>721.69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Hydration Packs</td>\n",
       "      <td>296.91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Jerseys</td>\n",
       "      <td>1597.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Pedals</td>\n",
       "      <td>490.73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Road Bikes</td>\n",
       "      <td>53478.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Road Frames</td>\n",
       "      <td>10031.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Socks</td>\n",
       "      <td>5.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Tires and Tubes</td>\n",
       "      <td>8.22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>Union City</td>\n",
       "      <td>Vests</td>\n",
       "      <td>904.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Bike Racks</td>\n",
       "      <td>432.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Bottles and Cages</td>\n",
       "      <td>31.79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Caps</td>\n",
       "      <td>53.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Cleaners</td>\n",
       "      <td>38.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Gloves</td>\n",
       "      <td>161.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Helmets</td>\n",
       "      <td>495.36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Hydration Packs</td>\n",
       "      <td>296.91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Jerseys</td>\n",
       "      <td>1178.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Saddles</td>\n",
       "      <td>63.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Tires and Tubes</td>\n",
       "      <td>10.96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Touring Bikes</td>\n",
       "      <td>77040.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Touring Frames</td>\n",
       "      <td>9430.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>Woolston</td>\n",
       "      <td>Vests</td>\n",
       "      <td>1108.19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City          catg_name  sum(amount)\n",
       "0       London    Bottom Brackets       388.73\n",
       "1       London             Brakes       255.60\n",
       "2       London             Chains        36.42\n",
       "3       London          Cranksets      1773.81\n",
       "4       London        Derailleurs       638.85\n",
       "5       London             Gloves        88.14\n",
       "6       London         Handlebars       292.63\n",
       "7       London            Helmets        20.99\n",
       "8       London     Mountain Bikes     50881.99\n",
       "9       London    Mountain Frames     24018.80\n",
       "10      London             Pedals       390.32\n",
       "11      London         Road Bikes      5102.95\n",
       "12      London            Saddles       284.43\n",
       "13      London             Shorts      1072.14\n",
       "14  Union City         Bike Racks       144.00\n",
       "15  Union City  Bottles and Cages        29.90\n",
       "16  Union City               Caps        57.31\n",
       "17  Union City           Cleaners        38.16\n",
       "18  Union City             Gloves       264.42\n",
       "19  Union City         Handlebars        72.16\n",
       "20  Union City            Helmets       721.69\n",
       "21  Union City    Hydration Packs       296.91\n",
       "22  Union City            Jerseys      1597.49\n",
       "23  Union City             Pedals       490.73\n",
       "24  Union City         Road Bikes     53478.76\n",
       "25  Union City        Road Frames     10031.90\n",
       "26  Union City              Socks         5.39\n",
       "27  Union City    Tires and Tubes         8.22\n",
       "28  Union City              Vests       904.95\n",
       "29    Woolston         Bike Racks       432.00\n",
       "30    Woolston  Bottles and Cages        31.79\n",
       "31    Woolston               Caps        53.90\n",
       "32    Woolston           Cleaners        38.16\n",
       "33    Woolston             Gloves       161.59\n",
       "34    Woolston            Helmets       495.36\n",
       "35    Woolston    Hydration Packs       296.91\n",
       "36    Woolston            Jerseys      1178.02\n",
       "37    Woolston            Saddles        63.16\n",
       "38    Woolston    Tires and Tubes        10.96\n",
       "39    Woolston      Touring Bikes     77040.15\n",
       "40    Woolston     Touring Frames      9430.95\n",
       "41    Woolston              Vests      1108.19"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(addr\n",
    " .join(addr\n",
    "       .join(order_head, on=(addr['AddressID']==order_head['ShipToAddressID']))\n",
    "       .groupBy('City')\n",
    "       .sum('SubTotal')\n",
    "       .orderBy(col('sum(SubTotal)').desc())\n",
    "       .limit(3),\n",
    "      on='City')\n",
    " .join(order_head, on=(addr['AddressID']==order_head['ShipToAddressID']))\n",
    " .join(order_det, on='SalesOrderID')\n",
    " .join(product, on='ProductID')\n",
    " .join(prod_cat.withColumnRenamed('name', 'catg_name'), on='ProductCategoryID')\n",
    " .withColumn('amount', col('OrderQty') * col('UnitPrice'))\n",
    " .groupBy('City', 'catg_name')\n",
    " .sum('amount')\n",
    " .orderBy('City', 'catg_name')\n",
    " .toPandas())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
